Machine Learning - UNH (Spring)

Oral Quiz 1

Read Files & Merge

library(data.table)
## Warning: package 'data.table' was built under R version 3.3.2
## read files and merge in one line (take advantage of similar file names)
all_quarters<- rbindlist(lapply(append(sprintf("q%d_16_sample.csv", 1:3), "q4_15_sample.csv"), read.csv, sep = " "))

names(all_quarters)
##  [1] "ITIN_ID"               "ORIGIN_AIRPORT_ID"    
##  [3] "ORIGIN_AIRPORT_SEQ_ID" "ORIGIN_CITY_MARKET_ID"
##  [5] "ORIGIN"                "ORIGIN_STATE_FIPS"    
##  [7] "DEST_AIRPORT_ID"       "DEST_AIRPORT_SEQ_ID"  
##  [9] "DEST_CITY_MARKET_ID"   "DEST"                 
## [11] "DEST_STATE_FIPS"       "AIRPORT_GROUP"        
## [13] "TK_CARRIER_CHANGE"     "TK_CARRIER_GROUP"     
## [15] "OP_CARRIER_CHANGE"     "OP_CARRIER_GROUP"     
## [17] "REPORTING_CARRIER"     "TICKET_CARRIER"       
## [19] "OPERATING_CARRIER"     "BULK_FARE"            
## [21] "MARKET_FARE"           "MARKET_DISTANCE"      
## [23] "DISTANCE_GROUP"        "MARKET_MILES_FLOWN"   
## [25] "NONSTOP_MILES"         "ITIN_GEO_TYPE"        
## [27] "MKT_GEO_TYPE"          "Year"                 
## [29] "Q"
dim(all_quarters)
## [1] 400000     29

Create New Features

Origin and destination state names

Scrape FIPS codes from US Census Bureau

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(stringr)
library(rvest)
## Warning: package 'xml2' was built under R version 3.3.2
fips_state_codes <- read_html("https://www.census.gov/geo/reference/ansi_statetables.html")


## hate that this isn't automated w/ function
states<- fips_state_codes %>% 
  html_nodes("table") %>%
  html_table() %>% .[[1]] %>%
  select(1, 2) %>% 
  rename(area_name = Name, fips_code = `FIPS State Numeric Code`)

outlying_territories<- fips_state_codes %>% 
  html_nodes("table") %>%
  html_table() %>% .[[2]] %>%
  select(1, 2) %>%
  rename(area_name = `Area Name`, fips_code = `FIPS State Numeric Code`) %>%
  .[-1,] ## remove first row - it's a long paragraph - don't need it

minor_outlying_territories<- fips_state_codes %>% 
  html_nodes("table") %>%
  html_table() %>% .[[3]] %>%
  select(1, 2) %>%
  rename(area_name = `Area Name`, fips_code = `FIPS State Numeric Code`)

all_fips_codes<- rbind(states, outlying_territories, minor_outlying_territories)

Create new columns: origin_state, dest_state

## create `origin_state`
names(all_fips_codes)<- c("origin_state", "ORIGIN_STATE_FIPS") ## change names for quick merge and column creation
all_fips_codes$ORIGIN_STATE_FIPS<- as.numeric(all_fips_codes$ORIGIN_STATE_FIPS)
all_quarters<- left_join(all_quarters, all_fips_codes, by = "ORIGIN_STATE_FIPS")

## create `dest_state`
names(all_fips_codes)<- c("dest_state", "DEST_STATE_FIPS") ## change names for quick merge and column creation
all_fips_codes$DEST_STATE_FIPS<- as.numeric(all_fips_codes$DEST_STATE_FIPS)
all_quarters<- left_join(all_quarters, all_fips_codes, by = "DEST_STATE_FIPS")

Create new columns: origin_city_state, dest_city_state

## read city lookup date from US DOT
city_lookup_table<- read.csv("L_CITY_MARKET_ID.csv")

## create `origin_city_state`
names(city_lookup_table)<- c("ORIGIN_CITY_MARKET_ID", "origin_city_state")
all_quarters<- left_join(all_quarters, city_lookup_table, by = "ORIGIN_CITY_MARKET_ID")

## create `dest_city_state`
names(city_lookup_table)<- c("DEST_CITY_MARKET_ID", "dest_city_state")
all_quarters<- left_join(all_quarters, city_lookup_table, by = "DEST_CITY_MARKET_ID")

Create new columns: cat_ITIN_ID, market_fare_quantile 1:4 for which quantile the price of that flight falls within

## bin ITIN_ID
all_quarters$bin_ITIN <- cut(all_quarters$ITIN_ID, breaks = quantile(all_quarters$ITIN_ID),labels=c("1","2","3","4"))

## bin market_fare
all_quarters$bin_Marketprice<-cut(all_quarters$MARKET_FARE,breaks = quantile(all_quarters$MARKET_FARE),labels = c("Low","Medium","High","Expensive"))

Return flights and One-way

x<-length(unique(all_quarters$ITIN_ID))
all_quarters$dups<-duplicated(all_quarters$ITIN_ID)
table(all_quarters$dups)
## 
##  FALSE   TRUE 
## 397415   2585
all_quarters$dups <- gsub("FALSE", "One-way", all_quarters$dups)
all_quarters$dups <- gsub("TRUE","Round-Trip",all_quarters$dups)
table(all_quarters$dups)
## 
##    One-way Round-Trip 
##     397415       2585

Create new columns: tk_carrier_number_of_changes

all_quarters$TK_CARRIER_GROUP <- as.character(all_quarters$TK_CARRIER_GROUP)

## separate TK_CARRIER_GROUP

## create tk_carrier_*
all_quarters_test<- separate(all_quarters, TK_CARRIER_GROUP, c("tk_carrier_1", "tk_carrier_2", "tk_carrier_3", "tk_carrier_4", "tk_carrier_5", "tk_carrier_6", "tk_carrier_7"), sep = ":")
## Warning: Too many values at 1 locations: 225970
## Warning: Too few values at 399997 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
# Create columns tk_change_count_* --> 1 or 0 if changed from tk_carrier_n to tk_carrier_n + 1
for(i in 1:6){
all_quarters_test[, paste0("tk_change_count_", i)] <- ifelse(all_quarters_test[, paste0("tk_carrier_", i)] != all_quarters_test[, paste0("tk_carrier_", i+1)] & (!is.na(all_quarters_test[, paste0("tk_carrier_", i+1)])), 1, 0)
}

# Sum all tk_change_count to get the total number of changes 
all_quarters_test<- all_quarters_test %>%
  mutate(tk_carrier_number_of_changes = rowSums(.[, names(.) %in% sprintf("tk_change_count_%d", 1:6)]))

# remove tk_change_count_* - no longer needed
all_quarters_test <- all_quarters_test %>%
  select(-starts_with("tk_change_count_"))


## top 20 `tk_carrier_number_of_changes`
all_quarters_test %>% 
  select(tk_carrier_number_of_changes) %>%
  arrange(desc(tk_carrier_number_of_changes)) %>%
  .[1:50, ]
##  [1] 4 4 4 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

Visualizing the most common origin & destination pairs with ggmap

zips<- read.csv("zip_codes_states.csv")


## merge city, state into one columns to merge on
zips<- unite(zips, city_state, city, state, sep = ", ")

zips<- na.omit(zips)

## create lat & long of origins
lat_long_origin<- zips %>%
  select(city_state, latitude, longitude) %>%
  group_by(city_state) %>%
  summarise(latitude = mean(latitude), longitude = mean(longitude)) %>%
  rename(origin_city_state = city_state,
         origin_lat = latitude, origin_long = longitude)

## create lat & long of destinations
lat_long_dest<- zips %>%
  select(city_state, latitude, longitude) %>%
  group_by(city_state) %>%
  summarise(latitude = mean(latitude), longitude = mean(longitude)) %>%
  rename(dest_city_state = city_state,
         dest_lat = latitude, dest_long = longitude)


## get rid of "(Metropolitan Area)"
all_quarters$origin_city_state<- as.character(all_quarters$origin_city_state)
all_quarters$origin_city_state<- str_replace(all_quarters$origin_city_state, " \\(.*\\)", "")

all_quarters$dest_city_state<- as.character(all_quarters$dest_city_state)
all_quarters$dest_city_state<- str_replace(all_quarters$dest_city_state, " \\(.*\\)", "")

## replace New York City with New York
all_quarters$origin_city_state<- gsub("New York City", "New York", all_quarters$origin_city_state)
all_quarters$dest_city_state<- gsub("New York City", "New York", all_quarters$dest_city_state)
all_quarters$origin_city_state<- gsub("St.", "Saint", all_quarters$origin_city_state)
all_quarters$dest_city_state<- gsub("St.", "Saint", all_quarters$dest_city_state)
all_quarters$origin_city_state<- gsub("Kona", "Kailua Kona", all_quarters$origin_city_state)
all_quarters$dest_city_state<- gsub("Kona", "Kailua Kona", all_quarters$dest_city_state)
all_quarters$origin_city_state<- gsub("-", " ", all_quarters$origin_city_state)
all_quarters$dest_city_state<- gsub("-", " ", all_quarters$dest_city_state)
all_quarters$origin_city_state<- gsub("Ft.", "Fort", all_quarters$origin_city_state)
all_quarters$dest_city_state<- gsub("Ft.", "Fort", all_quarters$dest_city_state)

## remove everything the left of / on dual cities using basename(), meant for getting file name from path
all_quarters$origin_city_state<- basename(all_quarters$origin_city_state)
all_quarters$dest_city_state<- basename(all_quarters$dest_city_state)


all_quarters<- left_join(all_quarters, lat_long_origin, by = "origin_city_state")
all_quarters<- left_join(all_quarters, lat_long_dest, by = "dest_city_state")
library(ggmap)
map<-get_map(location='united states', zoom=4, maptype = 'terrain',
             source='google',color='color')

reasonable_prices<- all_quarters %>%
  filter(MARKET_FARE <= 600)

ggmap(map) + geom_point(
        aes(x = origin_long, y = origin_lat, colour = MARKET_FARE), 
        data = reasonable_prices, alpha = 0.8, na.rm = T) +
        scale_color_gradient(low="beige", high="blue")

ggmap(map) + geom_point(
        aes(x = dest_long, y = dest_lat, colour = MARKET_FARE), 
        data = reasonable_prices, alpha = 0.8, na.rm = T) +
        scale_color_gradient(low="beige", high="blue")

Top 50 Origin to Destinations Matches

library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.3.2
frequent_locations<- all_quarters %>%
  select(origin_city_state, dest_city_state, origin_lat, origin_long, dest_lat, dest_long)

top_n_pairs <- frequent_locations %>%
  group_by(origin_city_state, dest_city_state) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

top_n_pairs<- left_join(top_n_pairs, lat_long_origin, by = "origin_city_state")
top_n_pairs<- left_join(top_n_pairs, lat_long_dest, by = "dest_city_state")

## Top 10 
top_n_pairs %>% 
  select(1:3) %>%
  head(n = 10)
## Source: local data frame [10 x 3]
## Groups: origin_city_state [5]
## 
##    origin_city_state   dest_city_state count
##                <chr>             <chr> <int>
## 1       New York, NY   Los Angeles, CA  1581
## 2    Los Angeles, CA      New York, NY  1508
## 3          Miami, FL      New York, NY  1486
## 4       New York, NY         Miami, FL  1415
## 5    Los Angeles, CA San Francisco, CA  1334
## 6  San Francisco, CA   Los Angeles, CA  1283
## 7  San Francisco, CA      New York, NY  1273
## 8       New York, NY San Francisco, CA  1272
## 9        Chicago, IL      New York, NY  1165
## 10      New York, NY       Chicago, IL  1150
destination_map<- ggmap(map, extent = "device", legend = "topleft") + stat_density2d(
  aes(x = dest_long, y = dest_lat, fill = ..level..,
  alpha = ..level..), na.rm = T,
  size = 2, bins = 4, data = top_n_pairs,
  geom = "polygon") + guides(fill = FALSE, alpha = FALSE) +
  ggtitle("Destination Density")

destination_map

## nearly the EXACT SAME AS DESTINATION
origin_map<- ggmap(map, extent = "device", legend = "topleft") + stat_density2d(
  aes(x = origin_long, y = origin_lat, fill = ..level..,
  alpha = ..level..), na.rm = T,
  size = 2, bins = 4, data = top_n_pairs,
  geom = "polygon") + guides(fill = FALSE, alpha = FALSE) +
  ggtitle("Origin Density")

origin_map